import os

# Despite its name, current_path is the absolute path of the PARENT of the
# directory that contains this file (dirname(__file__) joined with os.pardir).
current_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))

from spire.pdf.common import *
from spire.pdf import *


# Convert a PDF to HTML with Spire.PDF
def spire_convert_html(pdf_path, html_path):
    """Convert a PDF file to HTML using Spire.PDF.

    Args:
        pdf_path: Path of the PDF file to load.
        html_path: Path the generated HTML is saved to.

    Returns:
        None. Any exception raised during conversion is caught and printed
        instead of being propagated to the caller.
    """
    try:
        # Create the document object that will hold the loaded PDF.
        doc = PdfDocument()
        try:
            # Load the PDF, then save it back out in HTML format.
            doc.LoadFromFile(pdf_path)
            doc.SaveToFile(html_path, FileFormat.HTML)
        finally:
            # Close the document even when loading or saving fails, so the
            # handle is not leaked on the error path.
            doc.Close()
    except Exception as e:
        print(e)


import fitz
from tqdm import tqdm


# Convert a PDF to HTML with PyMuPDF (fitz)
def fitz_convert_html(pdf_path, html_path):
    """Convert a PDF file to a single HTML file using PyMuPDF (fitz).

    The HTML produced for each page is concatenated inside one
    ``<html><body>`` wrapper and written to ``html_path`` as UTF-8.

    Args:
        pdf_path: Path of the PDF file to open.
        html_path: Path the generated HTML is written to.

    Returns:
        None. Any exception raised during conversion is caught and printed
        instead of being propagated to the caller.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            parts = ["<!DOCTYPE html><html lang=\"zh-CN\"><head><meta "
                     "charset=\"UTF-8\"><title>Title</title></head><body>"]
            # tqdm only adds a progress bar; plain `for page in doc` works too.
            for page in tqdm(doc):
                parts.append(page.get_text('html'))
        finally:
            # Release the document whether or not page extraction succeeded.
            doc.close()
        # Close the wrapper exactly once, after all pages (not once per page),
        # so the output is well-formed for any page count, including zero.
        parts.append("</body></html>")
        # Save the assembled HTML.
        with open(html_path, 'w', encoding='utf8') as f:
            f.write("".join(parts))
    except Exception as e:
        print(e)


# from PIL import Image
# from pdf2image import convert_from_path
# from weasyprint import HTML
#
#
# def weasyprint_convert_html(pdf_path, html_path):
#     try:
#         images = convert_from_path(pdf_path)
#         html_content = "<!DOCTYPE html><html><body>"
#         for i, image in enumerate(images):
#             image.save(f'page_{i}.png')
#             html_content += f'<img src="page_{i}.png">'
#         html_content += "</body></html>"
#         HTML(string=html_content).write_pdf("output.pdf")
#         with open(html_path, "w", encoding="utf-8") as f:
#             f.write(html_content)
#     except Exception as e:
#         print(e)


if __name__ == "__main__":
    # Sample run: read from <current_path>/pdf and write to <current_path>/html.
    source_pdf = os.path.join(current_path, "pdf", "00A9D1859B7B43C1A8D4D9C2BCDB0773.pdf")
    target_html = os.path.join(current_path, "html", "00A9D1859B7B43C1A8D4D9C2BCDB0773.html")
    # Only the PyMuPDF converter is active; the alternatives are left here
    # for quick switching.
    # spire_convert_html(source_pdf, target_html)
    fitz_convert_html(source_pdf, target_html)
    # weasyprint_convert_html(source_pdf, target_html)
